{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Combining Multiple Datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "import pandas as pd\n",
    "\n",
    "# Load individual datasets\n",
    "alpaca_dataset = load_dataset(\"tatsu-lab/alpaca\")\n",
    "chat_instruction_prompt_dataset = load_dataset(\"alespalla/chatbot_instruction_prompts\")\n",
    "dolly_databricks_dataset = load_dataset(\"databricks/databricks-dolly-15k\")\n",
    "long_form_questions_dataset = load_dataset(\"akoksal/LongForm\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert individual splits to Pandas DataFrames\n",
    "df_alpaca = alpaca_dataset[\"train\"].to_pandas()\n",
    "df_chat_instruction_prompt_train = chat_instruction_prompt_dataset[\"train\"].to_pandas()\n",
    "df_chat_instruction_prompt_test = chat_instruction_prompt_dataset[\"test\"].to_pandas()\n",
    "df_dolly_databricks = dolly_databricks_dataset[\"train\"].to_pandas()\n",
    "df_long_form_train = long_form_questions_dataset[\"train\"].to_pandas()\n",
    "df_long_form_test = long_form_questions_dataset[\"test\"].to_pandas()\n",
    "df_long_form_validation = long_form_questions_dataset[\"validation\"].to_pandas()\n",
    "\n",
    "# Rename columns to \"instruction\" and \"output\" for consistency\n",
    "df_alpaca = df_alpaca.rename(columns={\"output\": \"output\"})\n",
    "df_chat_instruction_prompt_train = df_chat_instruction_prompt_train.rename(columns={\"prompt\": \"instruction\", \"response\": \"output\"})\n",
    "df_chat_instruction_prompt_test = df_chat_instruction_prompt_test.rename(columns={\"prompt\": \"instruction\", \"response\": \"output\"})\n",
    "df_dolly_databricks = df_dolly_databricks.rename(columns={\"instruction\": \"instruction\", \"response\": \"output\", \"context\": \"context\"})\n",
    "df_long_form_train = df_long_form_train.rename(columns={\"input\": \"instruction\", \"output\": \"output\"})\n",
    "df_long_form_test = df_long_form_test.rename(columns={\"input\": \"instruction\", \"output\": \"output\"})\n",
    "df_long_form_validation = df_long_form_validation.rename(columns={\"input\": \"instruction\", \"output\": \"output\"})\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add input or context columns to the end of each instruction\n",
    "def add_input_context(row):\n",
    "    if 'input' in row and pd.notna(row['input']) and row['input'] != \"\":\n",
    "        return row['instruction'] + \"    \" + \"here is the input \" + row['input']\n",
    "    elif 'context' in row and pd.notna(row['context']) and row['context'] != \"\":\n",
    "        return row['instruction'] + \"    \" + \"here is the input \" + row['context']\n",
    "    else:\n",
    "        return row['instruction']\n",
    "\n",
    "df_alpaca['instruction'] = df_alpaca.apply(add_input_context, axis=1)\n",
    "df_dolly_databricks['instruction'] = df_dolly_databricks.apply(add_input_context, axis=1)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save individual DataFrames to CSV files and describe features\n",
    "df_alpaca.to_csv(\"alpaca_dataset.csv\", index=False)\n",
    "print(\"Saved alpaca_dataset.csv. Features:\", df_alpaca.columns)\n",
    "\n",
    "df_chat_instruction_prompt_train.to_csv(\"chat_instruction_prompt_train.csv\", index=False)\n",
    "print(\"Saved chat_instruction_prompt_train.csv. Features:\", df_chat_instruction_prompt_train.columns)\n",
    "\n",
    "df_chat_instruction_prompt_test.to_csv(\"chat_instruction_prompt_test.csv\", index=False)\n",
    "print(\"Saved chat_instruction_prompt_test.csv. Features:\", df_chat_instruction_prompt_test.columns)\n",
    "\n",
    "df_dolly_databricks.to_csv(\"dolly_databricks_dataset.csv\", index=False)\n",
    "print(\"Saved dolly_databricks_dataset.csv. Features:\", df_dolly_databricks.columns)\n",
    "\n",
    "df_long_form_train.to_csv(\"long_form_train_dataset.csv\", index=False)\n",
    "print(\"Saved long_form_train_dataset.csv. Features:\", df_long_form_train.columns)\n",
    "\n",
    "df_long_form_test.to_csv(\"long_form_test_dataset.csv\", index=False)\n",
    "print(\"Saved long_form_test_dataset.csv. Features:\", df_long_form_test.columns)\n",
    "\n",
    "df_long_form_validation.to_csv(\"long_form_validation_dataset.csv\", index=False)\n",
    "print(\"Saved long_form_validation_dataset.csv. Features:\", df_long_form_validation.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ensure each DataFrame has a unique index before concatenating\n",
    "df_alpaca.reset_index(drop=True, inplace=True)\n",
    "df_chat_instruction_prompt_train.reset_index(drop=True, inplace=True)\n",
    "df_chat_instruction_prompt_test.reset_index(drop=True, inplace=True)\n",
    "df_dolly_databricks.reset_index(drop=True, inplace=True)\n",
    "df_long_form_train.reset_index(drop=True, inplace=True)\n",
    "df_long_form_test.reset_index(drop=True, inplace=True)\n",
    "df_long_form_validation.reset_index(drop=True, inplace=True)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Concatenate all DataFrames into a single DataFrame\n",
    "df_combined = pd.concat([df_alpaca, df_chat_instruction_prompt_train, \n",
    "                        df_chat_instruction_prompt_test, df_dolly_databricks, df_long_form_train,\n",
    "                        df_long_form_test, df_long_form_validation], ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the combined DataFrame to a CSV file\n",
    "df_combined.to_csv(\"combined_dataset.csv\", index=False)\n",
    "\n",
    "# Print a message indicating that the CSV file has been saved\n",
    "print(\"Combined CSV file saved successfully.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_combined.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the \"instruction\" and \"output\" columns in df_combined\n",
    "df_combined_filtered = df_combined[['instruction', 'output']]\n",
    "\n",
    "# Save the filtered DataFrame to a new CSV file\n",
    "df_combined_filtered.to_csv(\"combined_dataset_filtered.csv\", index=False)\n",
    "\n",
    "# Print a message indicating that the filtered CSV file has been saved\n",
    "print(\"Filtered CSV file saved successfully.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(df_combined_filtered.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "final_dataset = load_dataset('csv', data_files='./combined_dataset_filtered.csv' , split='train')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from huggingface_hub import notebook_login\n",
    "from datasets import load_dataset\n",
    "notebook_login()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "final_dataset.push_to_hub(\"CognitiveLab/English_Instruction_Combined\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "final_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
